Roey Angel 2021-03-02
set.seed(2021)
prev_thresh <- 0.1 # remove ASVs with prevalence less than that
samples_prep_path <- "./"
data_path <- "./DADA2_pseudo/"
# Metadata_table <- "./AnCUE_Metadata_decontam.csv"
# Seq_table <- "DADA2.seqtab_nochim_decontam.tsv"
# Seq_file <- "DADA2.Seqs_decontam.fa"
Ps_file <- "Ps_obj_decontam.Rds"
Seq_file <- "DADA2.Seqs_decontam.fa"Also remove controls
readRDS(paste0(data_path, Ps_file)) ->
Ps_obj
Ps_obj %>%
subset_samples(., Hours == 0) ->
Ps_obj_t0
Ps_obj %>%
subset_samples(., Hours != 0 & !is.na(Site)) ->
Ps_obj_SIP Combine repeated runs
sample_data(Ps_obj_SIP) %<>%
as("data.frame") %>%
rownames_to_column() %>%
mutate(Identifier = paste(Site, Oxygen, Glucose, Label..13C., Hours, Fraction.no.,
sep = "_")) %>%
column_to_rownames("rowname")
phyloseq_merge_samples(Ps_obj_SIP, "Identifier") %>%
filter_taxa(., function(x) sum(x) > 0, TRUE) ->
Ps_obj_SIP_merged
# Compute lib sizes
sample_data(Ps_obj_SIP_merged)$Lib.size <- rowSums(otu_table(Ps_obj_SIP_merged))First let us look at the count data distribution
I will test now the effect of library size and all other experimental factors on the community composition and also plot
(mod1 <- adonis(vegdist(otu_table(Ps_obj_SIP_merged), method = "bray") ~ Site * Oxygen * Hours + Lib.size,
data = as(sample_data(Ps_obj_SIP_merged), "data.frame"),
permutations = 999
))##
## Call:
## adonis(formula = vegdist(otu_table(Ps_obj_SIP_merged), method = "bray") ~ Site * Oxygen * Hours + Lib.size, data = as(sample_data(Ps_obj_SIP_merged), "data.frame"), permutations = 999)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Site 1 22.555 22.5553 314.924 0.39126 0.001 ***
## Oxygen 1 6.377 6.3767 89.033 0.11061 0.001 ***
## Hours 1 0.333 0.3332 4.652 0.00578 0.002 **
## Lib.size 1 4.943 4.9430 69.016 0.08575 0.001 ***
## Site:Oxygen 1 1.483 1.4830 20.706 0.02573 0.001 ***
## Site:Hours 1 0.471 0.4707 6.572 0.00817 0.002 **
## Oxygen:Hours 1 0.281 0.2809 3.922 0.00487 0.005 **
## Site:Oxygen:Hours 1 0.363 0.3631 5.070 0.00630 0.001 ***
## Residuals 291 20.842 0.0716 0.36154
## Total 299 57.648 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Modelling library size shows a significant effect of read depth on the community structure, but explaining only 9% of the variance. The reads histogram shows as expected a highly sparse and skewed sequence matrix. The mean vs SD also shows as expected large dependency of SD on the mean reads of a sequence across all samples.
Now let us look at the taxonomic distribution
##
## Archaea Bacteria
## 185 14280
##
## Abditibacteria Acidimicrobiia Acidobacteriae
## 21 679 1199
## Actinobacteria AD3 Alphaproteobacteria
## 784 27 2031
## Anaerolineae Armatimonadia Babeliae
## 3 84 680
## Bacilli Bacteroidia BD7-11
## 459 503 16
## Bdellovibrionia Blastocatellia Campylobacteria
## 110 20 5
## Chlamydiae Chloroflexia Chthonomonadetes
## 583 4 63
## Clostridia Coriobacteriia Cyanobacteriia
## 276 4 178
## Dehalococcoidia Deinococci Desulfitobacteriia
## 1 10 12
## Desulfobulbia Desulfovibrionia Elusimicrobia
## 1 2 14
## Endomicrobia Fibrobacteria Fimbriimonadia
## 2 5 58
## Fusobacteriia Gammaproteobacteria Gemmatimonadetes
## 21 2250 25
## Gitt-GS-136 Gracilibacteria Halobacteria
## 1 1 1
## Holophagae Hydrogenedentia JG30-KF-CM66
## 2 1 25
## Kapabacteria KD4-96 Ktedonobacteria
## 10 1 82
## Lentisphaeria Lineage IIa Longimicrobia
## 2 5 2
## Methanobacteria Micrarchaeia Myxococcia
## 1 4 111
## Negativicutes Nitrososphaeria Oligoflexia
## 13 129 106
## OM190 Omnitrophia Parcubacteria
## 1 11 11
## Phycisphaerae Planctomycetes Polyangia
## 154 1703 146
## Rhodothermia S0134 terrestrial group Saccharimonadia
## 2 1 49
## Sericytochromatia SJA-28 Spirochaetia
## 1 1 1
## Subgroup 22 Subgroup 5 Syntrophobacteria
## 1 12 1
## Thermoanaerobaculia Thermoleophilia Thermoplasmata
## 2 223 50
## TK10 Unclassified vadinHA49
## 10 615 11
## Vampirivibrionia Verrucomicrobiae Vicinamibacteria
## 206 599 27
Accordingly, we will remove some taxa which are obvious artefacts or those which aren’t bacteria or archaea
kingdoms2remove <- c("", "Eukaryota", "Unclassified")
orders2remove <- c("Chloroplast")
families2remove <- c("Mitochondria")
Ps_obj_kingdoms <- tax_glom(Ps_obj_SIP_merged, "Kingdom")
Ps_obj_orders <- tax_glom(Ps_obj_SIP_merged, "Order")
Ps_obj_families <- tax_glom(Ps_obj_SIP_merged, "Family")
Ps_obj_SIP_merged_filt <- subset_taxa(Ps_obj_SIP_merged, !is.na(Phylum) &
!Kingdom %in% kingdoms2remove &
!Order %in% orders2remove &
!Family %in% families2remove)Summary_pruned <- tibble(
Level = c("Kingdom", "Order", "Family"),
ASVs.removed = c(
table(tax_table(Ps_obj)[, "Kingdom"], exclude = NULL) %>% as.data.frame() %>% .[.$Var1 == "" | .$Var1 == "Eukaryota" | .$Var1 == "Unclassified", 2] %>% sum(),
table(tax_table(Ps_obj)[, "Order"], exclude = NULL) %>% as.data.frame() %>% .[.$Var1 == "Chloroplast", 2] %>% sum(),
table(tax_table(Ps_obj)[, "Family"], exclude = NULL) %>% as.data.frame() %>% .[.$Var1 == "Mitochondria", 2] %>% sum()
),
Seqs.removed = c(
psmelt(Ps_obj_kingdoms) %>%
group_by(Kingdom) %>%
filter(Kingdom == "" |
Kingdom == "Eukaryota" | Kingdom == "Unclassified") %>%
summarise(sum = sum(Abundance)) %>% .$sum %>% sum(),
psmelt(Ps_obj_orders) %>%
group_by(Order) %>%
filter(Order == orders2remove) %>%
summarise(sum = sum(Abundance)) %>% .$sum %>% sum(),
psmelt(Ps_obj_families) %>%
group_by(Family) %>%
filter(Family == families2remove) %>%
summarise(sum = sum(Abundance)) %>% .$sum %>% sum()
)
)
Summary_pruned %>%
kable(., digits = c(0, 1, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)|
Level |
ASVs.removed |
Seqs.removed |
|---|---|---|
|
Kingdom |
0 |
0 |
|
Order |
136 |
4294 |
|
Family |
138 |
15725 |
Removed 0.0335% of the sequences.
Now let’s explore the prevalence of different taxa in the database. Prevalence is the number of samples in which a taxa appears at least once. So “Mean prevalence” refers to in how many samples does a sequence belonging to the phylum appears on average, and “Sum prevalence” is the sum of all samples where any sequence from the taxon appears.
prevdf <- apply(X = otu_table(Ps_obj_SIP_merged_filt),
MARGIN = ifelse(taxa_are_rows(Ps_obj_SIP_merged_filt), yes = 1, no = 2),
FUN = function(x){sum(x > 0)})
# Add taxonomy and total read counts to this data.frame
prevdf <- data.frame(Prevalence = prevdf,
TotalAbundance = taxa_sums(Ps_obj_SIP_merged_filt),
tax_table(Ps_obj_SIP_merged_filt))
prevdf %>%
group_by(Phylum) %>%
summarise(`Mean prevalence` = mean(Prevalence),
`Sum prevalence` = sum(Prevalence)) ->
Prevalence_phylum_summary
Prevalence_phylum_summary %>%
kable(., digits = c(0, 1, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)|
Phylum |
Mean prevalence |
Sum prevalence |
|---|---|---|
|
Abditibacteriota |
21.3 |
448 |
|
Acidobacteriota |
58.2 |
73619 |
|
Actinobacteriota |
51.3 |
86754 |
|
Armatimonadota |
43.5 |
9178 |
|
Bacteroidota |
20.9 |
10783 |
|
Bdellovibrionota |
7.5 |
1625 |
|
Campilobacterota |
3.6 |
18 |
|
Chloroflexi |
29.5 |
4632 |
|
Crenarchaeota |
38.9 |
5012 |
|
Cyanobacteria |
17.5 |
4458 |
|
Deinococcota |
1.5 |
15 |
|
Dependentiae |
11.7 |
7981 |
|
Desulfobacterota |
6.7 |
40 |
|
Elusimicrobiota |
13.0 |
273 |
|
Euryarchaeota |
1.0 |
1 |
|
FCPU426 |
44.6 |
1606 |
|
Fibrobacterota |
9.0 |
45 |
|
Firmicutes |
22.1 |
18329 |
|
Fusobacteriota |
4.4 |
93 |
|
GAL15 |
40.1 |
281 |
|
Gemmatimonadota |
16.7 |
468 |
|
Halobacterota |
1.0 |
1 |
|
Hydrogenedentes |
1.0 |
1 |
|
Micrarchaeota |
1.5 |
6 |
|
Myxococcota |
46.7 |
11996 |
|
Patescibacteria |
9.0 |
588 |
|
Planctomycetota |
36.0 |
67790 |
|
Proteobacteria |
28.2 |
119347 |
|
RCP2-54 |
105.0 |
8924 |
|
SAR324 clade(Marine group B) |
33.0 |
33 |
|
Spirochaetota |
1.0 |
1 |
|
Thermoplasmatota |
49.3 |
2464 |
|
Unclassified |
10.3 |
1825 |
|
Verrucomicrobiota |
29.7 |
35502 |
|
WPS-2 |
63.1 |
8456 |
prevdf %>%
group_by(Order) %>%
summarise(`Mean prevalence` = mean(Prevalence),
`Sum prevalence` = sum(Prevalence)) ->
Prevalence_order_summary
Prevalence_order_summary %>%
kable(., digits = c(0, 1, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)|
Order |
Mean prevalence |
Sum prevalence |
|---|---|---|
|
0319-6G20 |
8.2 |
655 |
|
11-24 |
7.5 |
15 |
|
Abditibacteriales |
21.3 |
448 |
|
Absconditabacteriales (SR1) |
2.0 |
2 |
|
Acetobacterales |
69.1 |
16435 |
|
Acholeplasmatales |
1.0 |
1 |
|
Acidaminococcales |
1.0 |
1 |
|
Acidimicrobiales |
1.0 |
1 |
|
Acidobacteriales |
51.6 |
30887 |
|
Actinomycetales |
3.5 |
38 |
|
Aeromonadales |
1.0 |
540 |
|
Alicyclobacillales |
26.6 |
186 |
|
Alteromonadales |
1.0 |
1 |
|
Ardenticatenales |
1.0 |
2 |
|
Armatimonadales |
43.8 |
3676 |
|
Azospirillales |
38.2 |
992 |
|
B12-WMSP1 |
1.0 |
1 |
|
Babeliales |
11.7 |
7981 |
|
Bacillales |
35.0 |
3151 |
|
Bacteriovoracales |
1.2 |
5 |
|
Bacteroidales |
2.0 |
117 |
|
Bacteroidetes VC2.1 Bac22 |
1.5 |
3 |
|
Bdellovibrionales |
6.5 |
687 |
|
Bifidobacteriales |
1.0 |
3 |
|
Blastocatellales |
1.3 |
22 |
|
Blfdi19 |
22.5 |
45 |
|
Bryobacterales |
76.7 |
11734 |
|
Burkholderiales |
23.8 |
6741 |
|
Caedibacterales |
11.7 |
82 |
|
Caldalkalibacillales |
3.5 |
7 |
|
Campylobacterales |
3.6 |
18 |
|
Candidatus Jorgensenbacteria |
3.8 |
23 |
|
Candidatus Kaiserbacteria |
1.0 |
1 |
|
Cardiobacteriales |
4.3 |
13 |
|
Catenulisporales |
97.8 |
2151 |
|
Caulobacterales |
57.1 |
9421 |
|
Cellvibrionales |
1.0 |
2 |
|
Chitinophagales |
34.7 |
5136 |
|
Chlamydiales |
17.1 |
9989 |
|
Chloroflexales |
1.0 |
1 |
|
Christensenellales |
1.4 |
18 |
|
Chthoniobacterales |
26.0 |
4752 |
|
Chthonomonadales |
19.0 |
1199 |
|
Clostridia UCG-014 |
1.0 |
2 |
|
Clostridiales |
16.8 |
606 |
|
Coriobacteriales |
1.2 |
5 |
|
Corynebacteriales |
41.8 |
5059 |
|
Coxiellales |
13.5 |
3232 |
|
Cyanobacteriales |
3.9 |
126 |
|
Cytophagales |
5.1 |
471 |
|
Deinococcales |
1.5 |
15 |
|
Desulfitobacteriales |
23.2 |
279 |
|
Desulfobulbales |
1.0 |
1 |
|
Desulfovibrionales |
1.0 |
2 |
|
Diplorickettsiales |
10.1 |
2318 |
|
Dongiales |
2.0 |
2 |
|
Elev-1554 |
14.5 |
29 |
|
Elev-16S-1166 |
1.0 |
1 |
|
Elsterales |
57.4 |
19463 |
|
Endomicrobiales |
1.0 |
2 |
|
Enterobacterales |
4.5 |
150 |
|
Entomoplasmatales |
8.2 |
41 |
|
Erysipelotrichales |
6.7 |
100 |
|
Exiguobacterales |
1.0 |
3 |
|
FCPU453 |
32.5 |
130 |
|
Fibrobacterales |
9.0 |
45 |
|
Fimbriimonadales |
70.2 |
4072 |
|
Flavobacteriales |
2.6 |
124 |
|
Frankiales |
83.3 |
28396 |
|
Fusobacteriales |
4.4 |
93 |
|
Gaiellales |
71.8 |
933 |
|
Gammaproteobacteria Incertae Sedis |
32.0 |
6088 |
|
Gastranaerophilales |
1.0 |
3 |
|
Gemmatales |
34.4 |
49765 |
|
Gemmatimonadales |
18.6 |
465 |
|
Group 1.1c |
39.2 |
3335 |
|
Haliangiales |
37.5 |
488 |
|
Halobacterales |
1.0 |
1 |
|
Holosporales |
22.9 |
1280 |
|
Hungateiclostridiaceae |
1.0 |
2 |
|
Hydrogenedentiales |
1.0 |
1 |
|
I3A |
10.6 |
53 |
|
IMCC26256 |
37.0 |
7954 |
|
Isosphaerales |
55.7 |
5624 |
|
JG36-TzT-191 |
82.3 |
3375 |
|
Kapabacteriales |
38.3 |
383 |
|
KF-JG30-C25 |
20.0 |
60 |
|
Kineosporiales |
1.0 |
5 |
|
Ktedonobacterales |
27.6 |
2232 |
|
Lachnospirales |
5.6 |
767 |
|
Lactobacillales |
14.4 |
1209 |
|
Legionellales |
19.7 |
3175 |
|
Leptolyngbyales |
1.0 |
1 |
|
Lineage IV |
10.9 |
98 |
|
Longimicrobiales |
1.0 |
2 |
|
Methanobacteriales |
1.0 |
1 |
|
Methanomassiliicoccales |
1.0 |
2 |
|
Methylacidiphilales |
33.8 |
2163 |
|
Methylococcales |
9.0 |
18 |
|
Micavibrionales |
57.0 |
57 |
|
Micrarchaeales |
1.5 |
6 |
|
Micrococcales |
7.4 |
891 |
|
Micromonosporales |
1.5 |
3 |
|
Micropepsales |
56.0 |
4645 |
|
Microtrichales |
2.1 |
27 |
|
mle1-27 |
11.0 |
44 |
|
Monoglobales |
1.0 |
4 |
|
Myxococcales |
22.2 |
2464 |
|
Nitrosococcales |
37.0 |
74 |
|
Nitrososphaerales |
60.5 |
1150 |
|
Nitrosotaleales |
25.6 |
436 |
|
Obscuribacterales |
21.2 |
4153 |
|
Oceanospirillales |
2.7 |
8 |
|
Oligoflexales |
16.6 |
216 |
|
Omnitrophales |
25.5 |
281 |
|
Opitutales |
38.4 |
3957 |
|
Oscillospirales |
4.6 |
146 |
|
Oxyphotobacteria Incertae Sedis |
1.0 |
2 |
|
Paenibacillales |
53.1 |
8012 |
|
Paracaedibacterales |
11.6 |
792 |
|
Pasteurellales |
14.1 |
127 |
|
Pedosphaerales |
70.4 |
13805 |
|
Peptococcales |
1.0 |
1 |
|
Peptostreptococcales-Tissierellales |
6.4 |
256 |
|
Phormidesmiales |
1.0 |
2 |
|
Phycisphaerales |
21.0 |
524 |
|
Pirellulales |
34.5 |
4455 |
|
Planctomycetales |
85.8 |
2230 |
|
Polyangiales |
70.8 |
8783 |
|
Propionibacteriales |
10.5 |
221 |
|
Pseudanabaenales |
1.0 |
1 |
|
Pseudomonadales |
15.1 |
1782 |
|
Pseudonocardiales |
36.8 |
699 |
|
Pyrinomonadales |
1.0 |
1 |
|
Reyranellales |
29.3 |
352 |
|
Rhizobiales |
62.3 |
23562 |
|
Rhodobacterales |
2.6 |
84 |
|
Rhodospirillales |
13.8 |
1658 |
|
Rhodothermales |
1.0 |
2 |
|
Rickettsiales |
15.6 |
1485 |
|
S-BQ2-57 soil group |
11.8 |
177 |
|
S085 |
1.0 |
1 |
|
Saccharimonadales |
11.0 |
537 |
|
Salinisphaerales |
48.2 |
1013 |
|
SAR11 clade |
1.0 |
1 |
|
SBR1031 |
1.0 |
1 |
|
Silvanigrellales |
4.8 |
62 |
|
Solibacterales |
89.8 |
10771 |
|
Solirubrobacterales |
75.9 |
15946 |
|
Sphingobacteriales |
29.4 |
4468 |
|
Sphingomonadales |
4.6 |
705 |
|
Spirochaetales |
1.0 |
1 |
|
Staphylococcales |
21.6 |
410 |
|
Steroidobacterales |
1.0 |
1 |
|
Streptomycetales |
63.1 |
883 |
|
Streptosporangiales |
2.9 |
105 |
|
Subgroup 12 |
10.8 |
43 |
|
Subgroup 13 |
61.6 |
1725 |
|
Subgroup 15 |
5.1 |
72 |
|
Subgroup 2 |
73.8 |
16985 |
|
Subgroup 7 |
1.0 |
2 |
|
Synechococcales |
1.0 |
2 |
|
Syntrophobacterales |
35.0 |
35 |
|
Tepidisphaerales |
35.1 |
4533 |
|
Thalassobaculales |
1.0 |
1 |
|
Thermicanales |
5.0 |
10 |
|
Thermoactinomycetales |
30.0 |
1951 |
|
Thermoanaerobaculales |
15.5 |
31 |
|
Thermomicrobiales |
1.0 |
1 |
|
TSBb06 |
10.2 |
61 |
|
Unclassified |
31.5 |
56145 |
|
Vampirovibrionales |
17.3 |
104 |
|
Veillonellales-Selenomonadales |
6.7 |
80 |
|
Verrucomicrobiales |
10.1 |
375 |
|
Vibrionales |
1.7 |
5 |
|
Vicinamibacterales |
31.3 |
846 |
|
Victivallales |
1.0 |
2 |
|
WD260 |
158.8 |
3335 |
|
Xanthomonadales |
29.6 |
2336 |
Based on that I’ll remove all orders with a sum prevalence of under 5% (15) of all samples
Prevalence_order_summary %>%
filter(`Sum prevalence` < (0.05 * nsamples(Ps_obj_SIP_merged_filt))) %>%
dplyr::select(Order) %>%
map(as.character) %>%
unlist() ->
filterOrder
Ps_obj_SIP_merged_filt2 <- subset_taxa(Ps_obj_SIP_merged_filt, !Order %in% filterOrder)
sample_data(Ps_obj_SIP_merged_filt2)$Lib.size <- rowSums(otu_table(Ps_obj_SIP_merged_filt2))
print(Ps_obj_SIP_merged_filt)## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 14199 taxa and 300 samples ]:
## sample_data() Sample Data: [ 300 samples by 29 sample variables ]:
## tax_table() Taxonomy Table: [ 14199 taxa by 6 taxonomic ranks ]:
## taxa are columns
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 14099 taxa and 300 samples ]:
## sample_data() Sample Data: [ 300 samples by 29 sample variables ]:
## tax_table() Taxonomy Table: [ 14099 taxa by 6 taxonomic ranks ]:
## taxa are columns
This removed 100 or 1% of the ESVs, and 0.046% of the reads.
Plot general prevalence features of the phyla
# Subset to the remaining phyla
prevdf_phylum_filt <- subset(prevdf,
Phylum %in% get_taxa_unique(Ps_obj_SIP_merged_filt2,
"Phylum"))
ggplot(prevdf_phylum_filt,
aes(TotalAbundance,
Prevalence / nsamples(Ps_obj_SIP_merged_filt2),
color = Phylum)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05,
alpha = 0.5,
linetype = 2) +
geom_point2(size = 2, alpha = 0.7) +
scale_x_log10() +
xlab("Total Abundance") +
ylab("Prevalence [Frac. Samples]") +
facet_wrap( ~ Phylum) +
theme(legend.position = "none")Plot general prevalence features of the top 20 orders
# Subset to the remaining phyla
prevdf_order_filt <- subset(prevdf,
Order %in% get_taxa_unique(Ps_obj_SIP_merged_filt2, "Order"))
# grab the top 30 most abundant orders
prevdf_order_filt %>%
group_by(Order) %>%
summarise(Combined.abundance = sum(TotalAbundance)) %>%
arrange(desc(Combined.abundance)) %>%
.[1:30, "Order"] ->
Orders2plot
prevdf_order_filt2 <- subset(prevdf,
Order %in% Orders2plot$Order)
ggplot(prevdf_order_filt2,
aes(TotalAbundance,
Prevalence / nsamples(Ps_obj_SIP_merged_filt2),
color = Order)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05,
alpha = 0.5,
linetype = 2) +
geom_point2(size = 2, alpha = 0.7) +
scale_x_log10() +
xlab("Total Abundance") +
ylab("Prevalence [Frac. Samples]") +
facet_wrap( ~ Order) +
theme(legend.position = "none")I’ll remove all sequences which appear in less than 5% of the samples
# Define prevalence threshold as 5% of total samples
prevalenceThreshold <- prev_thresh * nsamples(Ps_obj_SIP_merged_filt2)
prevalenceThreshold## [1] 30
# Execute prevalence filter, using `prune_taxa()` function
keepTaxa <-
row.names(prevdf_phylum_filt)[(prevdf_phylum_filt$Prevalence >= prevalenceThreshold)]
Ps_obj_SIP_merged_filt3 <- prune_taxa(keepTaxa, Ps_obj_SIP_merged_filt2)
sample_data(Ps_obj_SIP_merged_filt3)$Lib.size <- rowSums(otu_table(Ps_obj_SIP_merged_filt3))
print(Ps_obj_SIP_merged_filt2)## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 14099 taxa and 300 samples ]:
## sample_data() Sample Data: [ 300 samples by 29 sample variables ]:
## tax_table() Taxonomy Table: [ 14099 taxa by 6 taxonomic ranks ]:
## taxa are columns
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 3693 taxa and 300 samples ]:
## sample_data() Sample Data: [ 300 samples by 29 sample variables ]:
## tax_table() Taxonomy Table: [ 3693 taxa by 6 taxonomic ranks ]:
## taxa are columns
This removed 10406 or 74% of the ESVs!
However all these removed ESVs accounted for only:
prevdf_phylum_filt %>%
arrange(., Prevalence) %>%
group_by(Prevalence > prevalenceThreshold) %>%
summarise(Abundance = sum(TotalAbundance)) %>%
mutate(`Rel. Ab.` = percent(Abundance / sum(Abundance), accuracy = 0.01)) %>%
kable(., digits = c(0, 1, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)|
Prevalence > prevalenceThreshold |
Abundance |
Rel. Ab. |
|---|---|---|
|
FALSE |
303010 |
2.37% |
|
TRUE |
12498417 |
97.63% |
So it’s fine to remove them.
Test again the effect of library size and all other experimental factors on the community composition after filtering
(mod1 <- adonis(vegdist(otu_table(Ps_obj_SIP_merged_filt3), method = "bray") ~ Site * Oxygen * Hours + Lib.size,
data = as(sample_data(Ps_obj_SIP_merged_filt3), "data.frame"),
permutations = 999
))##
## Call:
## adonis(formula = vegdist(otu_table(Ps_obj_SIP_merged_filt3), method = "bray") ~ Site * Oxygen * Hours + Lib.size, data = as(sample_data(Ps_obj_SIP_merged_filt3), "data.frame"), permutations = 999)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Site 1 22.638 22.6379 342.15 0.40294 0.001 ***
## Oxygen 1 6.358 6.3575 96.09 0.11316 0.001 ***
## Hours 1 0.322 0.3223 4.87 0.00574 0.005 **
## Lib.size 1 5.069 5.0686 76.61 0.09022 0.001 ***
## Site:Oxygen 1 1.475 1.4747 22.29 0.02625 0.001 ***
## Site:Hours 1 0.458 0.4576 6.92 0.00814 0.001 ***
## Oxygen:Hours 1 0.264 0.2643 3.99 0.00470 0.007 **
## Site:Oxygen:Hours 1 0.345 0.3448 5.21 0.00614 0.002 **
## Residuals 291 19.254 0.0662 0.34270
## Total 299 56.181 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
saveRDS(Ps_obj_SIP_merged_filt3, file = paste0(data_path, str_remove(Ps_file, ".Rds"), "_filt3.Rds"))
readDNAStringSet(paste0(data_path, Seq_file)) %>%
.[taxa_names(Ps_obj_SIP_merged_filt3)] %>%
writeXStringSet(., filepath = paste0(data_path, str_remove(Seq_file, ".fa*"), "_filtered.fa"), format = "fasta", width = 1000)
─ Session info ─────────────────────────────────────────────────────────────────────────
setting value
version R version 4.0.3 (2020-10-10)
os Ubuntu 18.04.5 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Prague
date 2021-03-02
─ Packages ─────────────────────────────────────────────────────────────────────────────
package * version date lib source
ade4 1.7-16 2020-10-28 [1] CRAN (R 4.0.2)
affy 1.66.0 2020-04-27 [1] Bioconductor
affyio 1.58.0 2020-04-27 [1] Bioconductor
ape 5.4-1 2020-08-13 [1] CRAN (R 4.0.2)
assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.2)
backports 1.2.1 2020-12-09 [1] CRAN (R 4.0.2)
bayestestR 0.8.2 2021-01-26 [1] CRAN (R 4.0.3)
Biobase * 2.48.0 2020-04-27 [1] Bioconductor
BiocGenerics * 0.34.0 2020-04-27 [1] Bioconductor
BiocManager 1.30.10 2019-11-16 [1] CRAN (R 4.0.2)
biomformat 1.16.0 2020-04-27 [1] Bioconductor
Biostrings * 2.56.0 2020-04-27 [1] Bioconductor
broom 0.7.5 2021-02-19 [1] CRAN (R 4.0.3)
cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.0.2)
cli 2.3.1 2021-02-23 [1] CRAN (R 4.0.3)
clipr 0.7.1 2020-10-08 [1] CRAN (R 4.0.2)
cluster 2.1.1 2021-02-14 [1] CRAN (R 4.0.3)
coda 0.19-4 2020-09-30 [1] CRAN (R 4.0.2)
codetools 0.2-18 2020-11-04 [1] CRAN (R 4.0.2)
colorspace 2.0-0 2020-11-11 [1] CRAN (R 4.0.2)
crayon 1.4.1 2021-02-08 [1] CRAN (R 4.0.3)
data.table 1.14.0 2021-02-21 [1] CRAN (R 4.0.3)
DBI 1.1.1 2021-01-15 [1] CRAN (R 4.0.3)
dbplyr 2.1.0 2021-02-03 [1] CRAN (R 4.0.3)
desc 1.2.0 2018-05-01 [1] CRAN (R 4.0.2)
details 0.2.1 2020-01-12 [1] CRAN (R 4.0.2)
digest 0.6.27 2020-10-24 [1] CRAN (R 4.0.2)
dplyr * 1.0.4 2021-02-02 [1] CRAN (R 4.0.3)
effectsize 0.4.3 2021-01-18 [1] CRAN (R 4.0.3)
ellipsis 0.3.1 2020-05-15 [1] CRAN (R 4.0.2)
emmeans 1.5.4 2021-02-03 [1] CRAN (R 4.0.3)
estimability 1.3 2018-02-11 [1] CRAN (R 4.0.2)
evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.2)
extrafont * 0.17 2014-12-08 [1] CRAN (R 4.0.2)
extrafontdb 1.0 2012-06-11 [1] CRAN (R 4.0.2)
fansi 0.4.2 2021-01-15 [1] CRAN (R 4.0.3)
farver 2.1.0 2021-02-28 [1] CRAN (R 4.0.3)
forcats * 0.5.1 2021-01-27 [1] CRAN (R 4.0.3)
foreach 1.5.1 2020-10-15 [1] CRAN (R 4.0.2)
fs 1.5.0 2020-07-31 [1] CRAN (R 4.0.2)
generics 0.1.0 2020-10-31 [1] CRAN (R 4.0.2)
ggplot2 * 3.3.3 2020-12-30 [1] CRAN (R 4.0.2)
ggridges 0.5.3 2021-01-08 [1] CRAN (R 4.0.2)
glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.2)
gtable 0.3.0 2019-03-25 [1] CRAN (R 4.0.2)
haven 2.3.1 2020-06-01 [1] CRAN (R 4.0.2)
hexbin 1.28.2 2021-01-08 [1] CRAN (R 4.0.2)
highr 0.8 2019-03-20 [1] CRAN (R 4.0.2)
hms 1.0.0 2021-01-13 [1] CRAN (R 4.0.3)
htmltools 0.5.1.1 2021-01-22 [1] CRAN (R 4.0.3)
httr 1.4.2 2020-07-20 [1] CRAN (R 4.0.2)
igraph 1.2.6 2020-10-06 [1] CRAN (R 4.0.2)
insight 0.13.1 2021-02-22 [1] CRAN (R 4.0.3)
IRanges * 2.22.2 2020-05-21 [1] Bioconductor
iterators 1.0.13 2020-10-15 [1] CRAN (R 4.0.2)
jsonlite 1.7.2 2020-12-09 [1] CRAN (R 4.0.2)
kableExtra * 1.3.4 2021-02-20 [1] CRAN (R 4.0.3)
knitr 1.31 2021-01-27 [1] CRAN (R 4.0.3)
labeling 0.4.2 2020-10-20 [1] CRAN (R 4.0.2)
lattice * 0.20-41 2020-04-02 [1] CRAN (R 4.0.2)
lifecycle 1.0.0 2021-02-15 [1] CRAN (R 4.0.3)
limma 3.44.3 2020-06-12 [1] Bioconductor
lubridate 1.7.10 2021-02-26 [1] CRAN (R 4.0.3)
magrittr * 2.0.1 2020-11-17 [1] CRAN (R 4.0.2)
MASS 7.3-53.1 2021-02-12 [1] CRAN (R 4.0.3)
Matrix 1.3-2 2021-01-06 [1] CRAN (R 4.0.2)
mgcv 1.8-34 2021-02-16 [1] CRAN (R 4.0.3)
modelr 0.1.8 2020-05-19 [1] CRAN (R 4.0.2)
multcomp 1.4-16 2021-02-08 [1] CRAN (R 4.0.3)
multtest 2.44.0 2020-04-27 [1] Bioconductor
munsell 0.5.0 2018-06-12 [1] CRAN (R 4.0.2)
mvtnorm 1.1-1 2020-06-09 [1] CRAN (R 4.0.2)
nlme 3.1-152 2021-02-04 [1] CRAN (R 4.0.3)
parameters 0.12.0 2021-02-21 [1] CRAN (R 4.0.3)
permute * 0.9-5 2019-03-12 [1] CRAN (R 4.0.2)
phyloseq * 1.32.0 2020-04-27 [1] Bioconductor
pillar 1.5.0 2021-02-22 [1] CRAN (R 4.0.3)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.2)
plyr 1.8.6 2020-03-03 [1] CRAN (R 4.0.2)
png 0.1-7 2013-12-03 [1] CRAN (R 4.0.2)
preprocessCore 1.50.0 2020-04-27 [1] Bioconductor
prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.0.2)
progress 1.2.2 2019-05-16 [1] CRAN (R 4.0.2)
purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.0.2)
R6 2.5.0 2020-10-28 [1] CRAN (R 4.0.2)
RColorBrewer 1.1-2 2014-12-07 [1] CRAN (R 4.0.2)
Rcpp 1.0.6 2021-01-15 [1] CRAN (R 4.0.3)
readr * 1.4.0 2020-10-05 [1] CRAN (R 4.0.2)
readxl 1.3.1 2019-03-13 [1] CRAN (R 4.0.2)
reprex 1.0.0 2021-01-27 [1] CRAN (R 4.0.3)
reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.0.2)
rhdf5 2.32.4 2020-10-05 [1] Bioconductor
Rhdf5lib 1.10.1 2020-07-09 [1] Bioconductor
rlang 0.4.10 2020-12-30 [1] CRAN (R 4.0.2)
rmarkdown 2.7 2021-02-19 [1] CRAN (R 4.0.3)
rprojroot 2.0.2 2020-11-15 [1] CRAN (R 4.0.2)
rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.0.2)
Rttf2pt1 1.3.8 2020-01-10 [1] CRAN (R 4.0.2)
rvest 0.3.6 2020-07-25 [1] CRAN (R 4.0.2)
S4Vectors * 0.26.1 2020-05-16 [1] Bioconductor
sandwich 3.0-0 2020-10-02 [1] CRAN (R 4.0.2)
scales * 1.1.1 2020-05-11 [1] CRAN (R 4.0.2)
see * 0.6.2 2021-02-04 [1] CRAN (R 4.0.3)
sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.2)
speedyseq * 0.5.3.9001 2020-10-27 [1] Github (mikemc/speedyseq@8daed32)
stringi 1.5.3 2020-09-09 [1] CRAN (R 4.0.2)
stringr * 1.4.0 2019-02-10 [1] CRAN (R 4.0.2)
survival 3.2-7 2020-09-28 [1] CRAN (R 4.0.2)
svglite * 2.0.0 2021-02-20 [1] CRAN (R 4.0.3)
systemfonts 1.0.1 2021-02-09 [1] CRAN (R 4.0.3)
TH.data 1.0-10 2019-01-21 [1] CRAN (R 4.0.2)
tibble * 3.1.0 2021-02-25 [1] CRAN (R 4.0.3)
tidyr * 1.1.2 2020-08-27 [1] CRAN (R 4.0.2)
tidyselect 1.1.0 2020-05-11 [1] CRAN (R 4.0.2)
tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 4.0.2)
utf8 1.1.4 2018-05-24 [1] CRAN (R 4.0.2)
vctrs 0.3.6 2020-12-17 [1] CRAN (R 4.0.2)
vegan * 2.5-7 2020-11-28 [1] CRAN (R 4.0.2)
viridisLite 0.3.0 2018-02-01 [1] CRAN (R 4.0.2)
vsn * 3.56.0 2020-04-27 [1] Bioconductor
webshot 0.5.2 2019-11-22 [1] CRAN (R 4.0.2)
withr 2.4.1 2021-01-26 [1] CRAN (R 4.0.3)
xfun 0.21 2021-02-10 [1] CRAN (R 4.0.3)
xml2 1.3.2 2020-04-23 [1] CRAN (R 4.0.2)
xtable 1.8-4 2019-04-21 [1] CRAN (R 4.0.2)
XVector * 0.28.0 2020-04-27 [1] Bioconductor
yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.2)
zlibbioc 1.34.0 2020-04-27 [1] Bioconductor
zoo 1.8-8 2020-05-02 [1] CRAN (R 4.0.2)
[1] /home/angel/R/library
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/library